a.)
x <- seq(1, 100)
x
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100
b.)
w <- 2 + 0.5*x
w
## [1] 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5
## [16] 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 16.5 17.0
## [31] 17.5 18.0 18.5 19.0 19.5 20.0 20.5 21.0 21.5 22.0 22.5 23.0 23.5 24.0 24.5
## [46] 25.0 25.5 26.0 26.5 27.0 27.5 28.0 28.5 29.0 29.5 30.0 30.5 31.0 31.5 32.0
## [61] 32.5 33.0 33.5 34.0 34.5 35.0 35.5 36.0 36.5 37.0 37.5 38.0 38.5 39.0 39.5
## [76] 40.0 40.5 41.0 41.5 42.0 42.5 43.0 43.5 44.0 44.5 45.0 45.5 46.0 46.5 47.0
## [91] 47.5 48.0 48.5 49.0 49.5 50.0 50.5 51.0 51.5 52.0
c.)
set.seed(123)
myNormDist <- rnorm(100, sd = 5)
cat("The mean is ", mean(myNormDist), "\n")
## The mean is 0.4520295
cat("The standard deviation is ", sd(myNormDist), "\n")
## The standard deviation is 4.564079
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Norm.df <- data.frame(myNormDist)
ggplotly(ggplot(data = Norm.df, aes(x = myNormDist)) + geom_histogram(bins = 15, fill = "#8a5789"))
We notice that even though we create a normal distribution with defined mean and standard deviation, we observe a differing sample mean and standard deviation. Observing our histogram, we also can see that our distribution is not perfectly symmetric.
d.)
y <- myNormDist + w
y
## [1] -0.3023782 1.8491126 11.2935416 4.3525420 5.1464387 13.5753249
## [7] 7.8045810 -0.3253062 3.0657357 4.7716901 13.6204090 9.7990691
## [13] 10.5038573 9.5534136 6.7207943 18.9345657 12.9892524 1.1669142
## [19] 15.0067795 9.6360430 7.1608815 11.9101254 8.3699778 10.3555439
## [25] 11.3748037 6.5665334 19.6889352 16.7668656 10.8093153 23.2690746
## [31] 19.6323211 16.5246426 22.9756283 23.3906674 23.6079054 23.4432013
## [37] 23.2695883 20.6904414 19.9701867 20.0976450 19.0264651 21.9604136
## [43] 17.1730182 34.8447798 30.5398100 19.3844571 23.4855758 23.6667232
## [49] 30.3998256 26.5831547 28.7665926 27.8572662 28.2856477 35.8430114
## [55] 28.3711451 37.5823530 22.7562360 33.9230687 32.1192712 33.0797078
## [61] 34.3981974 30.4883827 31.8339631 28.9071231 29.1410439 36.5176432
## [67] 37.7410489 36.2650211 41.1113373 47.2504234 35.0448442 26.4541556
## [73] 43.5286926 35.4539962 36.0599569 45.1278568 39.0761350 34.8964114
## [79] 42.4065174 41.3055432 42.5288209 44.9264020 41.6466998 47.2218827
## [85] 43.3975672 46.6589098 50.9841951 48.1759075 44.8703421 52.7440381
## [91] 52.4675193 50.7419848 49.6936587 45.8604696 56.3032622 46.9987021
## [97] 61.4366650 58.6630531 50.3214982 46.8678955
e.)
Norm.df["y"] <- y
names(Norm.df) <- c("x", "y")
head(Norm.df)
## x y
## 1 -2.8023782 -0.3023782
## 2 -1.1508874 1.8491126
## 3 7.7935416 11.2935416
## 4 0.3525420 4.3525420
## 5 0.6464387 5.1464387
## 6 8.5753249 13.5753249
# Base size for labels is 11
# https://ggplot2.tidyverse.org/reference/ggtheme.html
ggplotly(ggplot(data = Norm.df, aes(x = x, y = y)) +
geom_point(color = "#7eb6ff") +
ggtitle("Y Versus X") +
theme_grey(base_size = 11*1.5))
f.)
Beta1 <-
sum((Norm.df[, 1] - mean(Norm.df[, 1])) * (Norm.df[, 2] - mean(Norm.df[, 2]))) / sum((Norm.df[, 1] - mean(Norm.df[, 1])) ^2)
Beta0 <- mean(Norm.df[,2]) - Beta1 * mean(Norm.df[, 1])
# Create column for predictions
Norm.df["Ypred"] <- Beta0 + Beta1 * Norm.df["x"]
ggplotly(ggplot(data = Norm.df, aes(x = x, y = y)) +
geom_point(color = "#7eb6ff") +
geom_line(data = Norm.df, aes(x = x, y = Ypred)) +
ggtitle("Y Versus X") +
theme_grey(base_size = 11*1.5))
Norm.df["ei"] <- Norm.df["y"] - Norm.df["Ypred"]
ggplotly(ggplot(data = Norm.df, aes(x = x, y = ei)) +
geom_point(color = "#75a876") +
ggtitle("Residuals Versus X") +
theme_grey(base_size = 11*1.5))
cat("The MSE is", sum(Norm.df["ei"]^2) / (nrow(Norm.df) - 2))
## The MSE is 211.2099
h.)
figList <- c()
for (i in rep(1,5)) {
set.seed(i)
xLoop <- rnorm(100, sd = 5)
yLoop <- myNormDist + w
Beta1Loop <-
sum((xLoop - mean(xLoop)) * (yLoop - mean(yLoop))) / sum((xLoop - mean(xLoop))^2)
Beta0Loop <- mean(yLoop) - Beta1 * mean(xLoop)
Loop.df <- data.frame(xLoop, yLoop, Beta0Loop + Beta1Loop * xLoop)
names(Loop.df) <- c("x", "y", "Ypred")
ggplotly(ggplot(data = Loop.df, aes(x = x, y = y)) +
geom_point(color = "#7eb6ff") +
geom_line(data = Loop.df, aes(x = x, y = Ypred)) +
ggtitle("Y Versus X") +
theme_grey(base_size = 11*1.5))
#figList <- c(figList, Loop.df)
}